shhh <- suppressPackageStartupMessages # It's a library, so shhh!

shhh(library( mgcv ))
shhh(library(dplyr))
shhh(library(ggplot2))
shhh(library(lme4))
shhh(library(tidymv))
shhh(library(gamlss))
shhh(library(gsubfn))
shhh(library(lmerTest))
shhh(library(tidyverse))
shhh(library(boot))
shhh(library(rsample))
shhh(library(plotrix))
shhh(library(ggrepel))
shhh(library(mgcv))

theme_set(theme_bw())
options(digits=4)
options(scipen=999)
set.seed(444)
pipe_message = function(.data, status) {message(status); .data}

Read in MoTR Data


rate = 160

file_prefix = "/Users/cui/Desktop/motr_provo_160/"
fnames = list.files(path=file_prefix)

df = data.frame()
for (f in fnames) {
  temp = read.csv(paste0(file_prefix, "/", f)) %>%
    mutate(subj = str_remove(f, "_reading_measures.csv")) %>%
    rename(go_past_time = go_pass_time)
  df = rbind(df, temp)
}

# View(df)

filter_df = df %>%
  group_by(para_nr, subj) %>%
    summarise(correct = if_else(unique(correctness) == 1, 1, 0)) %>%
  ungroup() %>%
  drop_na() %>%
  group_by(subj) %>%
    summarise(p_correct = mean(correct)) %>%
  ungroup() %>%
  mutate(p_correct = round(p_correct, digits = 2))
`summarise()` has grouped output by 'para_nr'. You can override using the `.groups` argument.
filter_df = filter_df %>%
  filter(p_correct < 0.8)
# View(filter_df)
## reader_3:0.70, reader_60:0.79, reader_76:0.72 , reader_256:0.71 , reader_262:0.57 

raw_df = df %>%
  mutate(word = str_trim(word)) %>%
  mutate(subj = str_remove(subj, "reader_")) %>%
  mutate(subj = as.integer(subj)) %>%
  filter(! subj %in% c(3, 60, 76, 256, 262)) %>%
  mutate(FPReg = if_else(total_duration == 0, NA, FPReg),
         first_duration = if_else(total_duration == 0, NA, first_duration),
         gaze_duration = if_else(total_duration == 0, NA, gaze_duration),
         go_past_time = if_else(total_duration == 0, NA, go_past_time),
         skip = if_else(total_duration == 0, 1, 0)) %>%
  # See below for filtering out reading measures that are super long
  dplyr::select(expr_id, cond_id, para_nr, word, word_nr, first_duration, total_duration, gaze_duration, go_past_time, FPReg, skip, subj) #%>%
  # drop_na()

raw_df
# View(raw_df)
unique(raw_df$subj)
 [1]   1 100 101 102 103 104 105 107 108   2 255 259 261 263  53  54  55  56  57  58  61  62  63  64  65  68  70  71  72  73  74
[32]  75  79  80  82  83  88  89  90  91  93  94  96  97  98  99
length(unique(raw_df$subj))
[1] 46
# Average across subjects
motr_agg_df = raw_df %>%
  gather(metric, value, 6:11) %>%
    drop_na() %>%
    group_by(para_nr, word_nr, word, metric) %>%
    mutate(outlier = if_else(metric != "FPReg" & metric != "skip" & value > (mean(value) + 3 * sd(value) ), T, F)) %>%
    filter(outlier == F) %>%
  # # Filter out words with a reading-time of zero
  # mutate(zero = if_else(metric != "FPReg" & value == 0, T, F)) %>%
  # filter(zero == F) %>%
  drop_na() %>%
    summarise(value = mean(value),
              nsubj = length(unique(subj))) %>%
  ungroup() %>%
  arrange(para_nr, word_nr) %>%
  rename(
    text_id = para_nr,
    word_text_idx = word_nr,
    motr_value = value
  )
`summarise()` has grouped output by 'para_nr', 'word_nr', 'word'. You can override using the `.groups` argument.
motr_agg_df
# View(motr_agg_df)
# write.csv(motr_agg_df, file = "/Users/cui/Desktop/MoTR/pipeline/ancillary_data/motr_agg_df.csv", row.names = FALSE)

Comparison to Provo

# Read in Provo surprisal, frequency and length data
provo_modeling_df = read.csv("/Users/cui/Desktop/MoTR/pipeline/ancillary_data/provo_df.csv") %>%
  dplyr::select(text_id, sent_id, trigger_idx, word, freq, surp, len) %>%
  rename(word_idx = trigger_idx)

provo_modeling_df
# View(provo_modeling_df)
# Read in Provo eyetracking data

provo_raw_df = read.csv("/Users/cui/Desktop/MoTR/pipeline/ancillary_data/provo_eyetracking.csv")

# unique(provo_raw_df$Participant_ID)
# length(unique(provo_raw_df$Participant_ID))

provo_eyetracking_df = provo_raw_df %>%
  dplyr::select(Participant_ID, Text_ID, Sentence_Number, Word_In_Sentence_Number, Word, Word_Number, IA_FIRST_FIX_PROGRESSIVE, IA_FIRST_RUN_DWELL_TIME, IA_DWELL_TIME, IA_REGRESSION_PATH_DURATION, IA_REGRESSION_OUT, IA_SKIP) %>%
  rename( #first_duration = IA_FIRST_FIXATION_DURATION,   
          gaze_duration = IA_FIRST_RUN_DWELL_TIME,
          total_duration = IA_DWELL_TIME,
          go_past_time = IA_REGRESSION_PATH_DURATION,
          subj = Participant_ID,
          text_id = Text_ID,
          sent_id = Sentence_Number,
          word_idx = Word_In_Sentence_Number,
          word_text_idx = Word_Number,   # IA_ID?
          word = Word,      # Word?
          FPReg = IA_REGRESSION_OUT,
          skip = IA_SKIP,
          ff_progressive = IA_FIRST_FIX_PROGRESSIVE) %>%
  mutate(first_duration = gaze_duration) %>%
  mutate(gaze_duration = if_else(ff_progressive == 0, 0, gaze_duration),
         go_past_time = if_else(ff_progressive == 0, 0, go_past_time)) %>%
  dplyr::select(-ff_progressive) %>%
  # drop_na() %>%     # will drop the whole row with all the metrics
  gather(metric, value, 7:12) %>%
  # mutate(value = if_else(is.na(value), as.integer(0), as.integer(value))) %>%
  # mutate(value = if_else(metric != "FPReg" & is.na(value), as.integer(0), as.integer(value))) %>%
  drop_na() %>%
  mutate(word = str_trim(word)) %>%
  mutate(subj = str_remove(subj, "Sub")) %>%
  mutate(subj = as.integer(subj)) %>%
    group_by(text_id, word_text_idx, sent_id, word_idx, word, metric) %>%
    mutate(outlier = if_else(metric != "FPReg" & metric != "skip" & value > (mean(value) + 3 * sd(value) ), T, F)) %>%
    filter(outlier == F) %>%
  ungroup() #%>%
  # # Filter out words with a reading-time of zero
  # mutate(zero = if_else(metric != "FPReg" & value == 0,T, F)) %>%
  # filter(zero == F)

# Aggregate cross-participant data for all subjects
provo_eyetracking_agg_df = provo_eyetracking_df %>%
  group_by(text_id, word_text_idx, sent_id, word_idx, word, metric) %>%
    summarise(value = mean(value),
              nsubj = length(unique(subj))) %>%
    ungroup()
`summarise()` has grouped output by 'text_id', 'word_text_idx', 'sent_id', 'word_idx', 'word'. You can override using the `.groups` argument.
# View(provo_eyetracking_df)

# View(provo_eyetracking_agg_df)
# write.csv(provo_eyetracking_agg_df, file = "/Users/cui/Desktop/MoTR/pipeline/ancillary_data/provo_eyetracking_agg_df.csv", row.names = FALSE)

# Split the eyetracking data in two by subjects to see how well it correlates with itself
provo_eyetracking_subj1_df_temp = provo_eyetracking_df %>%
  filter(subj <= 42) %>%
  mutate(word_text_idx = as.integer(word_text_idx - 1)) %>%
  group_by(text_id, word_text_idx, sent_id, word_idx, word, metric) %>%
    summarise(value = mean(value)) %>%
  ungroup() %>%
  rename(value_1 = value) #%>%
`summarise()` has grouped output by 'text_id', 'word_text_idx', 'sent_id', 'word_idx', 'word'. You can override using the `.groups` argument.
  # dplyr::select(-sent_id, -word_idx)

# View(provo_eyetracking_subj1_df_temp)

provo_eyetracking_subj1_df = merge(provo_eyetracking_subj1_df_temp, motr_agg_df, by=c("text_id", "word_text_idx", "metric")) %>%
  arrange(text_id, sent_id, word_idx) %>%
  filter(!(text_id == 13 & word_text_idx >= 20 & word_text_idx <= 52)) %>%
  filter(!(text_id == 3 & word_text_idx >= 46 & word_text_idx <= 57)) %>%
  rename(word = word.y) %>%
  dplyr::select(text_id, word_text_idx, metric, word, value_1)

# View(provo_eyetracking_subj1_df)

provo_eyetracking_subj2_df = provo_eyetracking_df %>%
  filter(subj > 42) %>%
  mutate(word_text_idx = as.integer(word_text_idx - 1)) %>%
  group_by(text_id, word_text_idx, sent_id, word_idx, word, metric) %>%
    summarise(value = mean(value)) %>%
  ungroup() %>%
    rename(value_2 = value)%>%
  dplyr::select(-sent_id, -word_idx)
`summarise()` has grouped output by 'text_id', 'word_text_idx', 'sent_id', 'word_idx', 'word'. You can override using the `.groups` argument.
# View(provo_eyetracking_subj2_df)
  
provo_eyetr_grouped_df = merge(provo_eyetracking_subj2_df, provo_eyetracking_subj1_df, by=c("text_id", "word_text_idx", "metric")) %>%
  # filter(word.x == word.y) %>%
  dplyr::select(-word.y) %>%
  group_by(metric) %>%
    mutate(motr_outlier = if_else(metric != "FPReg" & metric != "skip" & value_1 > (mean(value_1) + 3 * sd(value_1) ), T, F)) %>%
    filter(motr_outlier == F) %>%
    mutate(eyetr_outlier = if_else(metric != "FPReg" & metric != "skip" & value_2 > (mean(value_2) + 3 * sd(value_2) ), T, F)) %>%
    filter(eyetr_outlier == F) %>%
  ungroup() %>%
  gather(measure, value, c("value_1", "value_2")) %>%
  dplyr::select(-motr_outlier, -eyetr_outlier)

# View(provo_eyetr_grouped_df)
provo_df = merge(provo_eyetracking_agg_df, provo_modeling_df, by=c("text_id", "sent_id", "word_idx")) %>%
  mutate(word_text_idx = as.integer(word_text_idx - 1)) %>%
  arrange(text_id, sent_id, word_idx) %>%
  rename(eyetr_value = value) 

provo_df = merge(provo_df, motr_agg_df, by=c("text_id", "word_text_idx", "metric")) %>%
arrange(text_id, sent_id, word_idx) %>%
  # almost all the word.x != word.y is because of normalization problem, so we can keep them, instead, deleting some special cases
filter(!(text_id == 13 & word_text_idx >= 20 & word_text_idx <= 52)) %>%
  filter(!(text_id == 3 & word_text_idx >= 46 & word_text_idx <= 57)) %>%
# filter(word.x == word) #%>%
dplyr::select(-word.x, -word.y) %>%
group_by(metric) %>%
  mutate(motr_outlier = if_else(metric != "FPReg" & motr_value > (mean(motr_value) + 3 * sd(motr_value) ), T, F)) %>%
  filter(motr_outlier == F) %>%
  mutate(eyetr_outlier = if_else(metric != "FPReg" & eyetr_value > (mean(eyetr_value) + 3 * sd(eyetr_value) ), T, F)) %>%
  filter(eyetr_outlier == F) %>%
ungroup() %>%
gather(measure, value, c("eyetr_value", "motr_value")) %>%
dplyr::select(-motr_outlier, -eyetr_outlier)
  
# View(provo_df)
# provo_df
provo_df %>%
  mutate(measure = if_else(measure == "eyetr_value", "Eyetracking Value", "MoTR Value")) %>%
  filter(metric != "FPReg" & metric != "skip") %>%
  ggplot(aes(x = value, color=metric)) +
    geom_density() +
    facet_wrap(.~measure, scales="free_y") +
    xlab("Reading Time (ms)")


# ggsave("../visualization/density.png", device = "png", width = 6, height = 2.5)
print("Gaze Duration")
[1] "Gaze Duration"
gd_df = provo_df %>% filter(metric == "gaze_duration") %>% spread(measure, value)
print(cor.test(gd_df$eyetr_value, gd_df$motr_value)$estimate)
   cor 
0.4822 
cor_df = provo_eyetr_grouped_df %>% filter(metric == "gaze_duration") %>% group_by(text_id, metric, measure) %>%
  summarize(value = mean(value, na.rm = TRUE), .groups = 'drop') %>% spread(measure, value)
print(cor.test(cor_df$value_1, cor_df$value_2)$estimate)
   cor 
0.6391 
print("First Duration")
[1] "First Duration"
fd_df = provo_df %>% filter(metric == "first_duration") %>% spread(measure, value)
print(cor.test(fd_df$eyetr_value, fd_df$motr_value)$estimate)
   cor 
0.4536 
# line 489, 490; 3147, 3148 share the same key
cor_df = provo_eyetr_grouped_df %>% filter(metric == "first_duration") %>% group_by(text_id, metric, measure) %>%
  summarize(value = mean(value, na.rm = TRUE), .groups = 'drop') %>% spread(measure, value)
print(cor.test(cor_df$value_1, cor_df$value_2)$estimate)
   cor 
0.8981 
print("Go Past Time")
[1] "Go Past Time"
gpt_df = provo_df %>% filter(metric == "go_past_time") %>% spread(measure, value)
print(cor.test(gpt_df$eyetr_value, gpt_df$motr_value)$estimate)
   cor 
0.3709 
cor_df = provo_eyetr_grouped_df %>% filter(metric == "go_past_time") %>% group_by(text_id, metric, measure) %>%
  summarize(value = mean(value, na.rm = TRUE), .groups = 'drop') %>% spread(measure, value)
print(cor.test(cor_df$value_1, cor_df$value_2)$estimate)
   cor 
0.6573 
print("Total Duration")
[1] "Total Duration"
td_df = provo_df %>% filter(metric == "total_duration") %>% spread(measure, value)
print(cor.test(td_df$eyetr_value, td_df$motr_value)$estimate)
   cor 
0.7601 
cor_df = provo_eyetr_grouped_df %>% filter(metric == "total_duration") %>% group_by(text_id, metric, measure) %>%
  summarize(value = mean(value, na.rm = TRUE), .groups = 'drop') %>% spread(measure, value)
print(cor.test(cor_df$value_1, cor_df$value_2)$estimate)
   cor 
0.9371 
print("Regression")
[1] "Regression"
reg_df = provo_df %>% filter(metric == "FPReg") %>% spread(measure, value)
print(cor.test(reg_df$eyetr_value, reg_df$motr_value)$estimate)
   cor 
0.2454 
cor_df = provo_eyetr_grouped_df %>% filter(metric == "FPReg") %>% group_by(text_id, metric, measure) %>%
  summarize(value = mean(value, na.rm = TRUE), .groups = 'drop') %>% spread(measure, value)
print(cor.test(cor_df$value_1, cor_df$value_2)$estimate)
   cor 
0.5558 
print("Skip")
[1] "Skip"
skip_df = provo_df %>% filter(metric == "skip") %>% spread(measure, value)
print(cor.test(skip_df$eyetr_value, skip_df$motr_value)$estimate)
  cor 
0.757 
cor_df = provo_eyetr_grouped_df %>% filter(metric == "skip") %>% group_by(text_id, metric, measure) %>%
  summarize(value = mean(value, na.rm = TRUE), .groups = 'drop') %>% spread(measure, value)
print(cor.test(cor_df$value_1, cor_df$value_2)$estimate)
   cor 
0.7544 

provo_df %>%
  filter(metric != "FPReg" & metric != "skip") %>%
  spread(measure, value) %>%
  ggplot(aes(x = motr_value, y=eyetr_value)) +
    geom_point(alpha = 0.05) +
    geom_abline(slope=1, intercept=0, color = "black") +
    #stat_summary_bin(bins=100, fun.data = "mean_cl_boot", size = 0.05) +
    facet_wrap(.~metric, scales = "free", nrow = 1) +
    coord_cartesian(ylim=c(0, 500), xlim=c(0, 500)) +
    geom_smooth()



# ggsave("../visualization/metric_cor.png", device = "png", width = 6, height = 2.5)

Correlations to Word-Level Statistical Properties

print("Len")
[1] "Len"
cor_df = provo_df %>% filter(metric == "gaze_duration") %>% spread(measure, value)
print(cor.test(cor_df$eyetr_value, cor_df$len)$estimate)
   cor 
0.7144 
print(cor.test(cor_df$motr_value, cor_df$len)$estimate)
   cor 
0.6182 
print("Freq")
[1] "Freq"
cor_df = provo_df %>% filter(metric == "gaze_duration") %>% spread(measure, value)
print(cor.test(cor_df$eyetr_value, cor_df$freq)$estimate)
    cor 
-0.6649 
print(cor.test(cor_df$motr_value, cor_df$freq)$estimate)
    cor 
-0.5246 
print("Surp")
[1] "Surp"
cor_df = provo_df %>% filter(metric == "gaze_duration") %>% spread(measure, value)
print(cor.test(cor_df$eyetr_value, cor_df$surp)$estimate)
   cor 
0.4676 
print(cor.test(cor_df$motr_value, cor_df$surp)$estimate)
   cor 
0.3558 
print("Len")
[1] "Len"
cor_df = provo_df %>% filter(metric == "total_duration") %>% spread(measure, value)
print(cor.test(cor_df$eyetr_value, cor_df$len)$estimate)
  cor 
0.825 
print(cor.test(cor_df$motr_value, cor_df$len)$estimate)
  cor 
0.838 
print("Freq")
[1] "Freq"
cor_df = provo_df %>% filter(metric == "total_duration") %>% spread(measure, value)
print(cor.test(cor_df$eyetr_value, cor_df$freq)$estimate)
   cor 
-0.781 
print(cor.test(cor_df$motr_value, cor_df$freq)$estimate)
    cor 
-0.7258 
print("Surp")
[1] "Surp"
cor_df = provo_df %>% filter(metric == "total_duration") %>% spread(measure, value)
print(cor.test(cor_df$eyetr_value, cor_df$surp)$estimate)
   cor 
0.5931 
print(cor.test(cor_df$motr_value, cor_df$surp)$estimate)
   cor 
0.5104 
print("Len")
[1] "Len"
cor_df = provo_df %>% filter(metric == "first_duration") %>% spread(measure, value)
print(cor.test(cor_df$eyetr_value, cor_df$len)$estimate)
   cor 
0.6562 
print(cor.test(cor_df$motr_value, cor_df$len)$estimate)
   cor 
0.6062 
print("Freq")
[1] "Freq"
cor_df = provo_df %>% filter(metric == "first_duration") %>% spread(measure, value)
print(cor.test(cor_df$eyetr_value, cor_df$freq)$estimate)
    cor 
-0.6242 
print(cor.test(cor_df$motr_value, cor_df$freq)$estimate)
    cor 
-0.5118 
print("Surp")
[1] "Surp"
cor_df = provo_df %>% filter(metric == "first_duration") %>% spread(measure, value)
print(cor.test(cor_df$eyetr_value, cor_df$surp)$estimate)
   cor 
0.4836 
print(cor.test(cor_df$motr_value, cor_df$surp)$estimate)
   cor 
0.3692 
print("Len")
[1] "Len"
cor_df = provo_df %>% filter(metric == "go_past_time") %>% spread(measure, value)
print(cor.test(cor_df$eyetr_value, cor_df$len)$estimate)
   cor 
0.6314 
print(cor.test(cor_df$motr_value, cor_df$len)$estimate)
   cor 
0.4132 
print("Freq")
[1] "Freq"
cor_df = provo_df %>% filter(metric == "go_past_time") %>% spread(measure, value)
print(cor.test(cor_df$eyetr_value, cor_df$freq)$estimate)
   cor 
-0.575 
print(cor.test(cor_df$motr_value, cor_df$freq)$estimate)
    cor 
-0.3696 
print("Surp")
[1] "Surp"
cor_df = provo_df %>% filter(metric == "go_past_time") %>% spread(measure, value)
print(cor.test(cor_df$eyetr_value, cor_df$surp)$estimate)
   cor 
0.4482 
print(cor.test(cor_df$motr_value, cor_df$surp)$estimate)
   cor 
0.2663 
provo_df %>%
  gather(word_prop, word_prop_val, c("freq", "len", "surp")) %>%
  mutate(measure = if_else(measure == "eyetr_value", "Eyetracking Value", "MoTR Value")) %>%
  mutate(word_prop = case_when(
    word_prop == "freq" ~ "Frequency",
    word_prop == "len" ~ "Length",
    word_prop == "surp" ~ "Surprisal"
  )) %>%
  filter(metric == "gaze_duration") %>%
  ggplot(aes(x = value, y=word_prop_val, color = measure)) +
    geom_point(alpha = 0.1) +
    facet_wrap(measure~word_prop, scales="free", strip.position = "right") +
    geom_smooth(color = "grey") +
    xlab("Reading Measure") +
  theme(
    legend.position = "none",
    strip.placement = "outside"
  )


# ggsave("../visualization/word_prop_comps.png", device = "png", width = 6, height = 3)
provo_df %>%
  ggplot(aes(x = value, y=freq, color=metric)) +
    geom_point(alpha = 0.1) +
    facet_grid(metric~measure, scales="free") +
    geom_smooth()

provo_df %>%
  ggplot(aes(x = value, y=surp, color=metric)) +
    geom_point(alpha = 0.2) +
    facet_grid(metric~measure, scales="free") +
    geom_smooth()

Shape of surprisal / RT relationship


fit_gam_inner = function(bootstrap_sample, mean_predictors) {
  
  df = bootstrap_sample$data
  weights = tabulate(as.integer(bootstrap_sample), nrow(df))
  
  m = gam(psychometric ~ s(surp, bs = 'cr', k = 6) + s(prev_surp, bs = 'cr', k = 6) + te(freq, len, bs = 'cr') + te(prev_freq, prev_len, bs = 'cr'), data = df, weights = weights)
  terms_to_predict = c("s(surp)", "s(prev_surp)")
  
  newdata = data.frame(surp=seq(0,20,by=0.1), prev_surp=mean_predictors$surp,
                       #surp=mean_predictors$surp, prev_surp=seq(0,20,by=0.1),
                       freq=mean_predictors$freq, prev_freq=mean_predictors$freq,
                       len=mean_predictors$len, prev_len=mean_predictors$len)
                       # len=mean_predictors$freq, prev_len=mean_predictors$freq) ##????

  # Returns a matrix N_samples * N_terms.
  per_term_predictions = predict(m, newdata=newdata, terms=terms_to_predict, type="terms")

  # Additive model -- sum across predictor response contributions (matrix columns).
  predictions = rowSums(per_term_predictions)
  
  # print("Fitted GAM model:")
  # print(m)

  # print("Predicted values for specified smooth terms:")
  # print(per_term_predictions)

  return(newdata %>% mutate(y=predictions))
}

fit_gam = function(df, mean_predictors, alpha=0.05) {
  # Bootstrap-resample data
  boot_models = df %>% bootstraps(times=10) %>% 
   # Fit a GAM and get predictions for each sample
    mutate(smoothed=map(splits, fit_gam_inner, mean_predictors=mean_predictors))
  
  # Extract mean and 5% and 95% percentile y-values for each surprisal value
  result = boot_models %>% 
    unnest(smoothed) %>% 
    dplyr::select(surp, y) %>%
    # dplyr::select(prev_surp, y) %>%
    group_by(surp) %>%
    # group_by(prev_surp) %>%
      summarise(y_lower=quantile(y, alpha / 2), 
                y_upper=quantile(y, 1 - alpha / 2),
                y=mean(y)) %>% 
    ungroup()
  # print(result)
  
  return (result)
}

gam_modeling_df = provo_df %>%
  spread(measure, value) %>%
  # mutate(len = nchar(word)) %>%  # len has already exists, but do not count punct into len.
  group_by(metric, text_id) %>%
    arrange(word_text_idx) %>%
    mutate(prev_surp = lag(surp),
           prev_freq = lag(freq),
           prev_len = lag(len),
           prev_eyetr_value = lag(eyetr_value)) %>%
  ungroup() %>%
  drop_na() %>%
  rename(psychometric = motr_value)
# View(gam_modeling_df)

smooths_df = data.frame()

metrics = c("gaze_duration", "total_duration", "go_past_time", "first_duration")
# metrics = c("gaze_duration")
for (m in metrics) {
  print(paste0("Fitting model for ", m))
  dummy_df = gam_modeling_df %>% filter(metric == m)
  mean_predictors = dummy_df %>% summarise(surp = mean(surp), len = mean(len), freq = mean(freq))
  smooths = dummy_df %>% fit_gam(., mean_predictors)
  #Fix 0 surprisal = 0 ms
  gam_smooths = smooths %>% mutate(delta = 0 - y[1], y=y + delta, y_lower= y_lower + delta, y_upper=y_upper + delta)
  smooths_df = rbind(smooths_df, gam_smooths %>% mutate(psychometric = m))
  # View(smooths_df)
}
[1] "Fitting model for gaze_duration"
[1] "Fitting model for total_duration"
[1] "Fitting model for go_past_time"
[1] "Fitting model for first_duration"

Get Density Data

get_d_points = function(df) {
    x = density(df$surp)$x
    y = density(df$surp)$y
    return(data.frame(x, y))
  }

density_data = data.frame()

for(m in c("gaze_duration", "total_duration", "go_past_time", "first_duration")) {
  dummy_df = provo_df %>% filter(metric == m) %>%
      do({get_d_points(.)}) %>%
      filter(x>0, x<20)
  density_data = rbind(density_data, dummy_df %>% mutate(metric=m))
}

# Surprisal curves
  ggplot() +
      # Density Data
      annotate("rect", xmin=0, xmax=20, ymin=-20,ymax=-10, fill="#f4f4f4", color="grey", alpha=1, size = 0) +
      geom_line(data = density_data, aes(x=x, y=y*50 - 18), color="#aaaaaa", size = 0.4) +
      # Surrp / Rt data
      # geom_line(data = smooths_df, aes(x=prev_surp, y=y, color = psychometric), size=0.7) +
      geom_line(data = smooths_df, aes(x=surp, y=y, color = psychometric), size=0.7) +
      # geom_ribbon(data = smooths_df, aes(x=prev_surp, ymin=y_lower, ymax=y_upper, fill = psychometric), alpha=0.3, size=0.5) +
      geom_ribbon(data = smooths_df, aes(x=surp, ymin=y_lower, ymax=y_upper, fill = psychometric), alpha=0.3, size=0.5) +
      scale_x_continuous(labels=c(0, 10, 20), breaks=c(0, 10, 20), minor_breaks = NULL) +
      facet_wrap(psychometric~., nrow = 1) +
      ylab("Slowdown due to Surprisal (ms)") +
      xlab("Surprisal of Word") +
      # ggtitle("MoTR Times and Previous Word Surprisal")
      ggtitle("MoTR Times and Current Word Surprisal")

  theme(
    legend.position = "none",
    panel.grid.minor = element_blank()
  )
List of 2
 $ legend.position : chr "none"
 $ panel.grid.minor: list()
  ..- attr(*, "class")= chr [1:2] "element_blank" "element"
 - attr(*, "class")= chr [1:2] "theme" "gg"
 - attr(*, "complete")= logi FALSE
 - attr(*, "validate")= logi TRUE
  
# ggsave("../visualization/surprisal_rt_link.png", device = "png", width = 6, height = 2.5)
fit_gam_inner_2 = function(bootstrap_sample, mean_predictors) {
  
  df = bootstrap_sample$data
  weights = tabulate(as.integer(bootstrap_sample), nrow(df))
  
  m = gam(psychometric ~ s(surp, bs = 'cr', k = 6) + s(prev_surp, bs = 'cr', k = 6) + te(freq, len, bs = 'cr') + te(prev_freq, prev_len, bs = 'cr'), data = df, weights = weights)
  terms_to_predict = c("s(surp)", "s(prev_surp)")
  
  newdata = data.frame(surp=mean_predictors$surp, prev_surp=seq(0,20,by=0.1),
                       freq=mean_predictors$freq, prev_freq=mean_predictors$freq,
                       len=mean_predictors$len, prev_len=mean_predictors$len)

  # Returns a matrix N_samples * N_terms.
  per_term_predictions = predict(m, newdata=newdata, terms=terms_to_predict, type="terms")

  # Additive model -- sum across predictor response contributions (matrix columns).
  predictions = rowSums(per_term_predictions)

  return(newdata %>% mutate(y=predictions))
}

fit_gam_2 = function(df, mean_predictors, alpha=0.05) {
  # Bootstrap-resample data
  boot_models = df %>% bootstraps(times=10) %>% 
   # Fit a GAM and get predictions for each sample
    mutate(smoothed=map(splits, fit_gam_inner_2, mean_predictors=mean_predictors))
  
  # Extract mean and 5% and 95% percentile y-values for each surprisal value
  result = boot_models %>% 
    unnest(smoothed) %>% 
    # dplyr::select(surp, y) %>%
    dplyr::select(prev_surp, y) %>%
    # group_by(surp) %>%
    group_by(prev_surp) %>%
      summarise(y_lower=quantile(y, alpha / 2), 
                y_upper=quantile(y, 1 - alpha / 2),
                y=mean(y)) %>% 
    ungroup()
  # print(result)
  
  return (result)
}
gam_modeling_df_2 = provo_df %>%
  spread(measure, value) %>%
  # mutate(len = nchar(word)) %>%  # len has already exists, but do not count punct into len.
  group_by(metric, text_id) %>%
    arrange(word_text_idx) %>%
    mutate(prev_surp = lag(surp),
           prev_freq = lag(freq),
           prev_len = lag(len),
           prev_eyetr_value = lag(eyetr_value)) %>%
  ungroup() %>%
  drop_na() %>%
  rename(psychometric = motr_value)

smooths_df = data.frame()

metrics = c("gaze_duration", "total_duration", "go_past_time", "first_duration")
for (m in metrics) {
  print(paste0("Fitting model for ", m))
  dummy_df = gam_modeling_df_2 %>% filter(metric == m)
  mean_predictors = dummy_df %>% summarise(surp = mean(surp), len = mean(len), freq = mean(freq))
  smooths = dummy_df %>% fit_gam_2(., mean_predictors)
  #Fix 0 surprisal = 0 ms
  gam_smooths = smooths %>% mutate(delta = 0 - y[1], y=y + delta, y_lower= y_lower + delta, y_upper=y_upper + delta)
  smooths_df = rbind(smooths_df, gam_smooths %>% mutate(psychometric = m))
}
[1] "Fitting model for gaze_duration"
[1] "Fitting model for total_duration"
[1] "Fitting model for go_past_time"
[1] "Fitting model for first_duration"
get_d_points = function(df) {
    x = density(df$surp)$x
    y = density(df$surp)$y
    return(data.frame(x, y))
  }

density_data = data.frame()

for(m in c("gaze_duration", "total_duration", "go_past_time", "first_duration")) {
  dummy_df = provo_df %>% filter(metric == m) %>%
      do({get_d_points(.)}) %>%
      filter(x>0, x<20)
  density_data = rbind(density_data, dummy_df %>% mutate(metric=m))
}
# Surprisal curves
  ggplot() +
      # Density Data
      annotate("rect", xmin=0, xmax=20, ymin=-20,ymax=-10, fill="#f4f4f4", color="grey", alpha=1, size = 0) +
      geom_line(data = density_data, aes(x=x, y=y*50 - 18), color="#aaaaaa", size = 0.4) +
      # Surrp / Rt data
      geom_line(data = smooths_df, aes(x=prev_surp, y=y, color = psychometric), size=0.7) +
      geom_ribbon(data = smooths_df, aes(x=prev_surp, ymin=y_lower, ymax=y_upper, fill = psychometric), alpha=0.3, size=0.5) +
      scale_x_continuous(labels=c(0, 10, 20), breaks=c(0, 10, 20), minor_breaks = NULL) +
      facet_wrap(psychometric~., nrow = 1) +
      ylab("Slowdown due to Surprisal (ms)") +
      xlab("Surprisal of Word") +
      ggtitle("MoTR Times and Previous Word Surprisal")

  theme(
    legend.position = "none",
    panel.grid.minor = element_blank()
  )
List of 2
 $ legend.position : chr "none"
 $ panel.grid.minor: list()
  ..- attr(*, "class")= chr [1:2] "element_blank" "element"
 - attr(*, "class")= chr [1:2] "theme" "gg"
 - attr(*, "complete")= logi FALSE
 - attr(*, "validate")= logi TRUE

precision and recall for FPReg

FPReg_df = provo_df %>% filter(metric == "FPReg") %>% spread(measure, value)
# write.csv(FPReg_df, file = "/Users/cui/Desktop/MoTR/pipeline/ancillary_data/FPReg.csv", row.names = FALSE)
confusion_matrix <- table(FPReg_df$motr_value > 0, FPReg_df$eyetr_value > 0)
confusion_matrix
       
        FALSE TRUE
  FALSE    79 1514
  TRUE     22  918
true_positives <- confusion_matrix[2, 2]
false_positives <- confusion_matrix[2, 1]
false_negatives <- confusion_matrix[1, 2]

precision <- true_positives / (true_positives + false_positives)
recall <- true_positives / (true_positives + false_negatives)

print("precision of Motr FPReg:")
[1] "precision of Motr FPReg:"
print(precision)
[1] 0.9766
print("Recall of Motr FPReg:")
[1] "Recall of Motr FPReg:"
print(recall)
[1] 0.3775

Relationship between first duration and regression probabilities. –> exploratory, not finished.

# Motr --> fit a linear model for first_duration. finding: More FPReg, more first duration. 

rlt_df = provo_df %>% filter(metric == "FPReg" | metric == "first_duration") %>% 
  filter(surp > 10) %>%
  spread(metric, value) %>%
  drop_na()

View(rlt_df)

motr_rlt = rlt_df %>% filter(measure == "motr_value") %>%
  mutate(FPReg_bi = if_else(FPReg == 0, 0, 1),
         fd_normalized = first_duration / len)

motr_rlt$FPReg_bi <- factor(motr_rlt$FPReg_bi)

View(motr_rlt)

# levels(motr_rlt$FPReg_bi)

# motr_lm = lm(first_duration ~ freq + FPReg_bi, data = motr_rlt)
motr_lm = lm(fd_normalized ~ FPReg, data = motr_rlt)

summary(motr_lm)

Call:
lm(formula = fd_normalized ~ FPReg, data = motr_rlt)

Residuals:
   Min     1Q Median     3Q    Max 
-41.21 -22.33  -9.31  11.75 100.63 

Coefficients:
            Estimate Std. Error t value            Pr(>|t|)    
(Intercept)    84.75       3.67   23.08 <0.0000000000000002 ***
FPReg         -44.00      27.22   -1.62                0.11    
---
Signif. codes:  0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1

Residual standard error: 31.9 on 101 degrees of freedom
Multiple R-squared:  0.0252,    Adjusted R-squared:  0.0156 
F-statistic: 2.61 on 1 and 101 DF,  p-value: 0.109
# write.csv(rlt_df, file = "/Users/cui/Desktop/MoTR/pipeline/ancillary_data/rlt.csv", row.names = FALSE)
# eyrtr -> same as motr, more FPReg, more first_duration

eyetr_rlt = rlt_df %>% filter(measure == "eyetr_value") %>%
  mutate(FPReg_bi = if_else(FPReg == 0, 0, 1),
         fd_normalized = first_duration / len)

eyetr_rlt$FPReg_bi <- factor(eyetr_rlt$FPReg_bi)

# eyetr_lm = lm(first_duration ~ FPReg_bi, data = eyetr_rlt)
eyetr_lm = lm(fd_normalized ~ FPReg, data = eyetr_rlt)

summary(eyetr_lm)

Call:
lm(formula = fd_normalized ~ FPReg, data = eyetr_rlt)

Residuals:
   Min     1Q Median     3Q    Max 
-27.93 -15.95  -4.94   7.02  81.22 

Coefficients:
            Estimate Std. Error t value            Pr(>|t|)    
(Intercept)    52.46       3.57   14.70 <0.0000000000000002 ***
FPReg           6.83      18.89    0.36                0.72    
---
Signif. codes:  0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1

Residual standard error: 22.3 on 101 degrees of freedom
Multiple R-squared:  0.00129,   Adjusted R-squared:  -0.0086 
F-statistic: 0.131 on 1 and 101 DF,  p-value: 0.718
---
title: "Exploratory Analysis for MoTR Reading Data"
output: html_notebook
---

```{r}
shhh <- suppressPackageStartupMessages # It's a library, so shhh!

shhh(library( mgcv ))
shhh(library(dplyr))
shhh(library(ggplot2))
shhh(library(lme4))
shhh(library(tidymv))
shhh(library(gamlss))
shhh(library(gsubfn))
shhh(library(lmerTest))
shhh(library(tidyverse))
shhh(library(boot))
shhh(library(rsample))
shhh(library(plotrix))
shhh(library(ggrepel))
shhh(library(mgcv))

theme_set(theme_bw())
options(digits=4)
options(scipen=999)
set.seed(444)
pipe_message = function(.data, status) {message(status); .data}

```


# Read in MoTR Data

```{r}

rate = 160

file_prefix = "/Users/cui/Desktop/motr_provo_160/"
fnames = list.files(path=file_prefix)

df = data.frame()
for (f in fnames) {
  temp = read.csv(paste0(file_prefix, "/", f)) %>%
    mutate(subj = str_remove(f, "_reading_measures.csv")) %>%
    rename(go_past_time = go_pass_time)
  df = rbind(df, temp)
}

# View(df)

filter_df = df %>%
  group_by(para_nr, subj) %>%
    summarise(correct = if_else(unique(correctness) == 1, 1, 0)) %>%
  ungroup() %>%
  drop_na() %>%
  group_by(subj) %>%
    summarise(p_correct = mean(correct)) %>%
  ungroup() %>%
  mutate(p_correct = round(p_correct, digits = 2))

filter_df = filter_df %>%
  filter(p_correct < 0.8)
# View(filter_df)
## reader_3:0.70, reader_60:0.79, reader_76:0.72 , reader_256:0.71 , reader_262:0.57 

raw_df = df %>%
  mutate(word = str_trim(word)) %>%
  mutate(subj = str_remove(subj, "reader_")) %>%
  mutate(subj = as.integer(subj)) %>%
  filter(! subj %in% c(3, 60, 76, 256, 262)) %>%
  mutate(FPReg = if_else(total_duration == 0, NA, FPReg),
         first_duration = if_else(total_duration == 0, NA, first_duration),
         gaze_duration = if_else(total_duration == 0, NA, gaze_duration),
         go_past_time = if_else(total_duration == 0, NA, go_past_time),
         skip = if_else(total_duration == 0, 1, 0)) %>%
  # See below for filtering out reading measures that are super long
  dplyr::select(expr_id, cond_id, para_nr, word, word_nr, first_duration, total_duration, gaze_duration, go_past_time, FPReg, skip, subj) #%>%
  # drop_na()

raw_df
# View(raw_df)

```

```{r}
unique(raw_df$subj)
length(unique(raw_df$subj))
```



```{r}
# Average across subjects
motr_agg_df = raw_df %>%
  gather(metric, value, 6:11) %>%
    drop_na() %>%
    group_by(para_nr, word_nr, word, metric) %>%
    mutate(outlier = if_else(metric != "FPReg" & metric != "skip" & value > (mean(value) + 3 * sd(value) ), T, F)) %>%
    filter(outlier == F) %>%
  # # Filter out words with a reading-time of zero
  # mutate(zero = if_else(metric != "FPReg" & value == 0, T, F)) %>%
  # filter(zero == F) %>%
  drop_na() %>%
    summarise(value = mean(value),
              nsubj = length(unique(subj))) %>%
  ungroup() %>%
  arrange(para_nr, word_nr) %>%
  rename(
    text_id = para_nr,
    word_text_idx = word_nr,
    motr_value = value
  )

motr_agg_df
# View(motr_agg_df)
# write.csv(motr_agg_df, file = "/Users/cui/Desktop/MoTR/pipeline/ancillary_data/motr_agg_df.csv", row.names = FALSE)

```




# Comparison to Provo


```{r}
# Read in Provo surprisal, frequency and length data
provo_modeling_df = read.csv("/Users/cui/Desktop/MoTR/pipeline/ancillary_data/provo_df.csv") %>%
  dplyr::select(text_id, sent_id, trigger_idx, word, freq, surp, len) %>%
  rename(word_idx = trigger_idx)

provo_modeling_df
# View(provo_modeling_df)

```

```{r}
# Read in Provo eyetracking data

provo_raw_df = read.csv("/Users/cui/Desktop/MoTR/pipeline/ancillary_data/provo_eyetracking.csv")

# unique(provo_raw_df$Participant_ID)
# length(unique(provo_raw_df$Participant_ID))

provo_eyetracking_df = provo_raw_df %>%
  dplyr::select(Participant_ID, Text_ID, Sentence_Number, Word_In_Sentence_Number, Word, Word_Number, IA_FIRST_FIX_PROGRESSIVE, IA_FIRST_RUN_DWELL_TIME, IA_DWELL_TIME, IA_REGRESSION_PATH_DURATION, IA_REGRESSION_OUT, IA_SKIP) %>%
  rename( #first_duration = IA_FIRST_FIXATION_DURATION,   
          gaze_duration = IA_FIRST_RUN_DWELL_TIME,
          total_duration = IA_DWELL_TIME,
          go_past_time = IA_REGRESSION_PATH_DURATION,
          subj = Participant_ID,
          text_id = Text_ID,
          sent_id = Sentence_Number,
          word_idx = Word_In_Sentence_Number,
          word_text_idx = Word_Number,   # IA_ID?
          word = Word,      # Word?
          FPReg = IA_REGRESSION_OUT,
          skip = IA_SKIP,
          ff_progressive = IA_FIRST_FIX_PROGRESSIVE) %>%
  mutate(first_duration = gaze_duration) %>%
  mutate(gaze_duration = if_else(ff_progressive == 0, 0, gaze_duration),
         go_past_time = if_else(ff_progressive == 0, 0, go_past_time)) %>%
  dplyr::select(-ff_progressive) %>%
  # drop_na() %>%     # will drop the whole row with all the metrics
  gather(metric, value, 7:12) %>%
  # mutate(value = if_else(is.na(value), as.integer(0), as.integer(value))) %>%
  # mutate(value = if_else(metric != "FPReg" & is.na(value), as.integer(0), as.integer(value))) %>%
  drop_na() %>%
  mutate(word = str_trim(word)) %>%
  mutate(subj = str_remove(subj, "Sub")) %>%
  mutate(subj = as.integer(subj)) %>%
    group_by(text_id, word_text_idx, sent_id, word_idx, word, metric) %>%
    mutate(outlier = if_else(metric != "FPReg" & metric != "skip" & value > (mean(value) + 3 * sd(value) ), T, F)) %>%
    filter(outlier == F) %>%
  ungroup() #%>%
  # # Filter out words with a reading-time of zero
  # mutate(zero = if_else(metric != "FPReg" & value == 0,T, F)) %>%
  # filter(zero == F)

# Aggregate cross-participant data for all subjects
provo_eyetracking_agg_df = provo_eyetracking_df %>%
  group_by(text_id, word_text_idx, sent_id, word_idx, word, metric) %>%
    summarise(value = mean(value),
              nsubj = length(unique(subj))) %>%
    ungroup()

# View(provo_eyetracking_df)

# View(provo_eyetracking_agg_df)
# write.csv(provo_eyetracking_agg_df, file = "/Users/cui/Desktop/MoTR/pipeline/ancillary_data/provo_eyetracking_agg_df.csv", row.names = FALSE)

```

```{r}

# Split the eyetracking data in two by subjects to see how well it correlates with itself
provo_eyetracking_subj1_df_temp = provo_eyetracking_df %>%
  filter(subj <= 42) %>%
  mutate(word_text_idx = as.integer(word_text_idx - 1)) %>%
  group_by(text_id, word_text_idx, sent_id, word_idx, word, metric) %>%
    summarise(value = mean(value)) %>%
  ungroup() %>%
  rename(value_1 = value) #%>%
  # dplyr::select(-sent_id, -word_idx)

# View(provo_eyetracking_subj1_df_temp)

provo_eyetracking_subj1_df = merge(provo_eyetracking_subj1_df_temp, motr_agg_df, by=c("text_id", "word_text_idx", "metric")) %>%
  arrange(text_id, sent_id, word_idx) %>%
  filter(!(text_id == 13 & word_text_idx >= 20 & word_text_idx <= 52)) %>%
  filter(!(text_id == 3 & word_text_idx >= 46 & word_text_idx <= 57)) %>%
  rename(word = word.y) %>%
  dplyr::select(text_id, word_text_idx, metric, word, value_1)

# View(provo_eyetracking_subj1_df)

provo_eyetracking_subj2_df = provo_eyetracking_df %>%
  filter(subj > 42) %>%
  mutate(word_text_idx = as.integer(word_text_idx - 1)) %>%
  group_by(text_id, word_text_idx, sent_id, word_idx, word, metric) %>%
    summarise(value = mean(value)) %>%
  ungroup() %>%
    rename(value_2 = value)%>%
  dplyr::select(-sent_id, -word_idx)

# View(provo_eyetracking_subj2_df)
  
provo_eyetr_grouped_df = merge(provo_eyetracking_subj2_df, provo_eyetracking_subj1_df, by=c("text_id", "word_text_idx", "metric")) %>%
  # filter(word.x == word.y) %>%
  dplyr::select(-word.y) %>%
  group_by(metric) %>%
    mutate(motr_outlier = if_else(metric != "FPReg" & metric != "skip" & value_1 > (mean(value_1) + 3 * sd(value_1) ), T, F)) %>%
    filter(motr_outlier == F) %>%
    mutate(eyetr_outlier = if_else(metric != "FPReg" & metric != "skip" & value_2 > (mean(value_2) + 3 * sd(value_2) ), T, F)) %>%
    filter(eyetr_outlier == F) %>%
  ungroup() %>%
  gather(measure, value, c("value_1", "value_2")) %>%
  dplyr::select(-motr_outlier, -eyetr_outlier)

# View(provo_eyetr_grouped_df)

```


```{r}
provo_df = merge(provo_eyetracking_agg_df, provo_modeling_df, by=c("text_id", "sent_id", "word_idx")) %>%
  mutate(word_text_idx = as.integer(word_text_idx - 1)) %>%
  arrange(text_id, sent_id, word_idx) %>%
  rename(eyetr_value = value) 

provo_df = merge(provo_df, motr_agg_df, by=c("text_id", "word_text_idx", "metric")) %>%
arrange(text_id, sent_id, word_idx) %>%
  # almost all the word.x != word.y is because of normalization problem, so we can keep them, instead, deleting some special cases
filter(!(text_id == 13 & word_text_idx >= 20 & word_text_idx <= 52)) %>%
  filter(!(text_id == 3 & word_text_idx >= 46 & word_text_idx <= 57)) %>%
# filter(word.x == word) #%>%
dplyr::select(-word.x, -word.y) %>%
group_by(metric) %>%
  mutate(motr_outlier = if_else(metric != "FPReg" & motr_value > (mean(motr_value) + 3 * sd(motr_value) ), T, F)) %>%
  filter(motr_outlier == F) %>%
  mutate(eyetr_outlier = if_else(metric != "FPReg" & eyetr_value > (mean(eyetr_value) + 3 * sd(eyetr_value) ), T, F)) %>%
  filter(eyetr_outlier == F) %>%
ungroup() %>%
gather(measure, value, c("eyetr_value", "motr_value")) %>%
dplyr::select(-motr_outlier, -eyetr_outlier)
  
# View(provo_df)
# provo_df
```

```{r}
provo_df %>%
  mutate(measure = if_else(measure == "eyetr_value", "Eyetracking Value", "MoTR Value")) %>%
  filter(metric != "FPReg" & metric != "skip") %>%
  ggplot(aes(x = value, color=metric)) +
    geom_density() +
    facet_wrap(.~measure, scales="free_y") +
    xlab("Reading Time (ms)")

# ggsave("../visualization/density.png", device = "png", width = 6, height = 2.5)
```


```{r}
print("Gaze Duration")
gd_df = provo_df %>% filter(metric == "gaze_duration") %>% spread(measure, value)
print(cor.test(gd_df$eyetr_value, gd_df$motr_value)$estimate)

cor_df = provo_eyetr_grouped_df %>% filter(metric == "gaze_duration") %>% group_by(text_id, metric, measure) %>%
  summarize(value = mean(value, na.rm = TRUE), .groups = 'drop') %>% spread(measure, value)
print(cor.test(cor_df$value_1, cor_df$value_2)$estimate)

print("First Duration")
fd_df = provo_df %>% filter(metric == "first_duration") %>% spread(measure, value)
print(cor.test(fd_df$eyetr_value, fd_df$motr_value)$estimate)

# line 489, 490; 3147, 3148 share the same key
cor_df = provo_eyetr_grouped_df %>% filter(metric == "first_duration") %>% group_by(text_id, metric, measure) %>%
  summarize(value = mean(value, na.rm = TRUE), .groups = 'drop') %>% spread(measure, value)
print(cor.test(cor_df$value_1, cor_df$value_2)$estimate)

print("Go Past Time")

gpt_df = provo_df %>% filter(metric == "go_past_time") %>% spread(measure, value)
print(cor.test(gpt_df$eyetr_value, gpt_df$motr_value)$estimate)

cor_df = provo_eyetr_grouped_df %>% filter(metric == "go_past_time") %>% group_by(text_id, metric, measure) %>%
  summarize(value = mean(value, na.rm = TRUE), .groups = 'drop') %>% spread(measure, value)
print(cor.test(cor_df$value_1, cor_df$value_2)$estimate)

print("Total Duration")

td_df = provo_df %>% filter(metric == "total_duration") %>% spread(measure, value)
print(cor.test(td_df$eyetr_value, td_df$motr_value)$estimate)

cor_df = provo_eyetr_grouped_df %>% filter(metric == "total_duration") %>% group_by(text_id, metric, measure) %>%
  summarize(value = mean(value, na.rm = TRUE), .groups = 'drop') %>% spread(measure, value)
print(cor.test(cor_df$value_1, cor_df$value_2)$estimate)

print("Regression")

reg_df = provo_df %>% filter(metric == "FPReg") %>% spread(measure, value)
print(cor.test(reg_df$eyetr_value, reg_df$motr_value)$estimate)

cor_df = provo_eyetr_grouped_df %>% filter(metric == "FPReg") %>% group_by(text_id, metric, measure) %>%
  summarize(value = mean(value, na.rm = TRUE), .groups = 'drop') %>% spread(measure, value)
print(cor.test(cor_df$value_1, cor_df$value_2)$estimate)

print("Skip")
skip_df = provo_df %>% filter(metric == "skip") %>% spread(measure, value)
print(cor.test(skip_df$eyetr_value, skip_df$motr_value)$estimate)

cor_df = provo_eyetr_grouped_df %>% filter(metric == "skip") %>% group_by(text_id, metric, measure) %>%
  summarize(value = mean(value, na.rm = TRUE), .groups = 'drop') %>% spread(measure, value)
print(cor.test(cor_df$value_1, cor_df$value_2)$estimate)


```



```{r}

provo_df %>%
  filter(metric != "FPReg" & metric != "skip") %>%
  spread(measure, value) %>%
  ggplot(aes(x = motr_value, y=eyetr_value)) +
    geom_point(alpha = 0.05) +
    geom_abline(slope=1, intercept=0, color = "black") +
    #stat_summary_bin(bins=100, fun.data = "mean_cl_boot", size = 0.05) +
    facet_wrap(.~metric, scales = "free", nrow = 1) +
    coord_cartesian(ylim=c(0, 500), xlim=c(0, 500)) +
    geom_smooth()


# ggsave("../visualization/metric_cor.png", device = "png", width = 6, height = 2.5)
```
## Correlations to Word-Level Statistical Properties

```{r}
print("Len")
cor_df = provo_df %>% filter(metric == "gaze_duration") %>% spread(measure, value)
print(cor.test(cor_df$eyetr_value, cor_df$len)$estimate)
print(cor.test(cor_df$motr_value, cor_df$len)$estimate)

print("Freq")
cor_df = provo_df %>% filter(metric == "gaze_duration") %>% spread(measure, value)
print(cor.test(cor_df$eyetr_value, cor_df$freq)$estimate)
print(cor.test(cor_df$motr_value, cor_df$freq)$estimate)

print("Surp")
cor_df = provo_df %>% filter(metric == "gaze_duration") %>% spread(measure, value)
print(cor.test(cor_df$eyetr_value, cor_df$surp)$estimate)
print(cor.test(cor_df$motr_value, cor_df$surp)$estimate)


```

```{r}
print("Len")
cor_df = provo_df %>% filter(metric == "total_duration") %>% spread(measure, value)
print(cor.test(cor_df$eyetr_value, cor_df$len)$estimate)
print(cor.test(cor_df$motr_value, cor_df$len)$estimate)

print("Freq")
cor_df = provo_df %>% filter(metric == "total_duration") %>% spread(measure, value)
print(cor.test(cor_df$eyetr_value, cor_df$freq)$estimate)
print(cor.test(cor_df$motr_value, cor_df$freq)$estimate)

print("Surp")
cor_df = provo_df %>% filter(metric == "total_duration") %>% spread(measure, value)
print(cor.test(cor_df$eyetr_value, cor_df$surp)$estimate)
print(cor.test(cor_df$motr_value, cor_df$surp)$estimate)
```

```{r}
print("Len")
cor_df = provo_df %>% filter(metric == "first_duration") %>% spread(measure, value)
print(cor.test(cor_df$eyetr_value, cor_df$len)$estimate)
print(cor.test(cor_df$motr_value, cor_df$len)$estimate)

print("Freq")
cor_df = provo_df %>% filter(metric == "first_duration") %>% spread(measure, value)
print(cor.test(cor_df$eyetr_value, cor_df$freq)$estimate)
print(cor.test(cor_df$motr_value, cor_df$freq)$estimate)

print("Surp")
cor_df = provo_df %>% filter(metric == "first_duration") %>% spread(measure, value)
print(cor.test(cor_df$eyetr_value, cor_df$surp)$estimate)
print(cor.test(cor_df$motr_value, cor_df$surp)$estimate)
```

```{r}
print("Len")
cor_df = provo_df %>% filter(metric == "go_past_time") %>% spread(measure, value)
print(cor.test(cor_df$eyetr_value, cor_df$len)$estimate)
print(cor.test(cor_df$motr_value, cor_df$len)$estimate)

print("Freq")
cor_df = provo_df %>% filter(metric == "go_past_time") %>% spread(measure, value)
print(cor.test(cor_df$eyetr_value, cor_df$freq)$estimate)
print(cor.test(cor_df$motr_value, cor_df$freq)$estimate)

print("Surp")
cor_df = provo_df %>% filter(metric == "go_past_time") %>% spread(measure, value)
print(cor.test(cor_df$eyetr_value, cor_df$surp)$estimate)
print(cor.test(cor_df$motr_value, cor_df$surp)$estimate)
```



```{r}
provo_df %>%
  gather(word_prop, word_prop_val, c("freq", "len", "surp")) %>%
  mutate(measure = if_else(measure == "eyetr_value", "Eyetracking Value", "MoTR Value")) %>%
  mutate(word_prop = case_when(
    word_prop == "freq" ~ "Frequency",
    word_prop == "len" ~ "Length",
    word_prop == "surp" ~ "Surprisal"
  )) %>%
  filter(metric == "gaze_duration") %>%
  ggplot(aes(x = value, y=word_prop_val, color = measure)) +
    geom_point(alpha = 0.1) +
    facet_wrap(measure~word_prop, scales="free", strip.position = "right") +
    geom_smooth(color = "grey") +
    xlab("Reading Measure") +
  theme(
    legend.position = "none",
    strip.placement = "outside"
  )

# ggsave("../visualization/word_prop_comps.png", device = "png", width = 6, height = 3)

```

```{r}
provo_df %>%
  ggplot(aes(x = value, y=freq, color=metric)) +
    geom_point(alpha = 0.1) +
    facet_grid(metric~measure, scales="free") +
    geom_smooth()
```

```{r}
provo_df %>%
  ggplot(aes(x = value, y=surp, color=metric)) +
    geom_point(alpha = 0.2) +
    facet_grid(metric~measure, scales="free") +
    geom_smooth()
```


## Shape of surprisal / RT relationship

```{r}

fit_gam_inner = function(bootstrap_sample, mean_predictors) {
  
  df = bootstrap_sample$data
  weights = tabulate(as.integer(bootstrap_sample), nrow(df))
  
  m = gam(psychometric ~ s(surp, bs = 'cr', k = 6) + s(prev_surp, bs = 'cr', k = 6) + te(freq, len, bs = 'cr') + te(prev_freq, prev_len, bs = 'cr'), data = df, weights = weights)
  terms_to_predict = c("s(surp)", "s(prev_surp)")
  
  newdata = data.frame(surp=seq(0,20,by=0.1), prev_surp=mean_predictors$surp,
                       #surp=mean_predictors$surp, prev_surp=seq(0,20,by=0.1),
                       freq=mean_predictors$freq, prev_freq=mean_predictors$freq,
                       len=mean_predictors$len, prev_len=mean_predictors$len)
                       # len=mean_predictors$freq, prev_len=mean_predictors$freq) ##????

  # Returns a matrix N_samples * N_terms.
  per_term_predictions = predict(m, newdata=newdata, terms=terms_to_predict, type="terms")

  # Additive model -- sum across predictor response contributions (matrix columns).
  predictions = rowSums(per_term_predictions)
  
  # print("Fitted GAM model:")
  # print(m)

  # print("Predicted values for specified smooth terms:")
  # print(per_term_predictions)

  return(newdata %>% mutate(y=predictions))
}

fit_gam = function(df, mean_predictors, alpha=0.05) {
  # Bootstrap-resample data
  boot_models = df %>% bootstraps(times=10) %>% 
   # Fit a GAM and get predictions for each sample
    mutate(smoothed=map(splits, fit_gam_inner, mean_predictors=mean_predictors))
  
  # Extract mean and 5% and 95% percentile y-values for each surprisal value
  result = boot_models %>% 
    unnest(smoothed) %>% 
    dplyr::select(surp, y) %>%
    # dplyr::select(prev_surp, y) %>%
    group_by(surp) %>%
    # group_by(prev_surp) %>%
      summarise(y_lower=quantile(y, alpha / 2), 
                y_upper=quantile(y, 1 - alpha / 2),
                y=mean(y)) %>% 
    ungroup()
  # print(result)
  
  return (result)
}

```


```{r}

gam_modeling_df = provo_df %>%
  spread(measure, value) %>%
  # mutate(len = nchar(word)) %>%  # len has already exists, but do not count punct into len.
  group_by(metric, text_id) %>%
    arrange(word_text_idx) %>%
    mutate(prev_surp = lag(surp),
           prev_freq = lag(freq),
           prev_len = lag(len),
           prev_eyetr_value = lag(eyetr_value)) %>%
  ungroup() %>%
  drop_na() %>%
  rename(psychometric = motr_value)
# View(gam_modeling_df)

smooths_df = data.frame()

metrics = c("gaze_duration", "total_duration", "go_past_time", "first_duration")
# metrics = c("gaze_duration")
for (m in metrics) {
  print(paste0("Fitting model for ", m))
  dummy_df = gam_modeling_df %>% filter(metric == m)
  mean_predictors = dummy_df %>% summarise(surp = mean(surp), len = mean(len), freq = mean(freq))
  smooths = dummy_df %>% fit_gam(., mean_predictors)
  #Fix 0 surprisal = 0 ms
  gam_smooths = smooths %>% mutate(delta = 0 - y[1], y=y + delta, y_lower= y_lower + delta, y_upper=y_upper + delta)
  smooths_df = rbind(smooths_df, gam_smooths %>% mutate(psychometric = m))
  # View(smooths_df)
}

```

### Get Density Data

```{r}
get_d_points = function(df) {
    x = density(df$surp)$x
    y = density(df$surp)$y
    return(data.frame(x, y))
  }

density_data = data.frame()

for(m in c("gaze_duration", "total_duration", "go_past_time", "first_duration")) {
  dummy_df = provo_df %>% filter(metric == m) %>%
      do({get_d_points(.)}) %>%
      filter(x>0, x<20)
  density_data = rbind(density_data, dummy_df %>% mutate(metric=m))
}

```


```{r}

# Surprisal curves
  ggplot() +
      # Density Data
      annotate("rect", xmin=0, xmax=20, ymin=-20,ymax=-10, fill="#f4f4f4", color="grey", alpha=1, size = 0) +
      geom_line(data = density_data, aes(x=x, y=y*50 - 18), color="#aaaaaa", size = 0.4) +
      # Surrp / Rt data
      # geom_line(data = smooths_df, aes(x=prev_surp, y=y, color = psychometric), size=0.7) +
      geom_line(data = smooths_df, aes(x=surp, y=y, color = psychometric), size=0.7) +
      # geom_ribbon(data = smooths_df, aes(x=prev_surp, ymin=y_lower, ymax=y_upper, fill = psychometric), alpha=0.3, size=0.5) +
      geom_ribbon(data = smooths_df, aes(x=surp, ymin=y_lower, ymax=y_upper, fill = psychometric), alpha=0.3, size=0.5) +
      scale_x_continuous(labels=c(0, 10, 20), breaks=c(0, 10, 20), minor_breaks = NULL) +
      facet_wrap(psychometric~., nrow = 1) +
      ylab("Slowdown due to Surprisal (ms)") +
      xlab("Surprisal of Word") +
      # ggtitle("MoTR Times and Previous Word Surprisal")
      ggtitle("MoTR Times and Current Word Surprisal")
  theme(
    legend.position = "none",
    panel.grid.minor = element_blank()
  )
  
# ggsave("../visualization/surprisal_rt_link.png", device = "png", width = 6, height = 2.5)


```


```{r}
fit_gam_inner_2 = function(bootstrap_sample, mean_predictors) {
  
  df = bootstrap_sample$data
  weights = tabulate(as.integer(bootstrap_sample), nrow(df))
  
  m = gam(psychometric ~ s(surp, bs = 'cr', k = 6) + s(prev_surp, bs = 'cr', k = 6) + te(freq, len, bs = 'cr') + te(prev_freq, prev_len, bs = 'cr'), data = df, weights = weights)
  terms_to_predict = c("s(surp)", "s(prev_surp)")
  
  newdata = data.frame(surp=mean_predictors$surp, prev_surp=seq(0,20,by=0.1),
                       freq=mean_predictors$freq, prev_freq=mean_predictors$freq,
                       len=mean_predictors$len, prev_len=mean_predictors$len)

  # Returns a matrix N_samples * N_terms.
  per_term_predictions = predict(m, newdata=newdata, terms=terms_to_predict, type="terms")

  # Additive model -- sum across predictor response contributions (matrix columns).
  predictions = rowSums(per_term_predictions)

  return(newdata %>% mutate(y=predictions))
}

fit_gam_2 = function(df, mean_predictors, alpha=0.05) {
  # Bootstrap-resample data
  boot_models = df %>% bootstraps(times=10) %>% 
   # Fit a GAM and get predictions for each sample
    mutate(smoothed=map(splits, fit_gam_inner_2, mean_predictors=mean_predictors))
  
  # Extract mean and 5% and 95% percentile y-values for each surprisal value
  result = boot_models %>% 
    unnest(smoothed) %>% 
    # dplyr::select(surp, y) %>%
    dplyr::select(prev_surp, y) %>%
    # group_by(surp) %>%
    group_by(prev_surp) %>%
      summarise(y_lower=quantile(y, alpha / 2), 
                y_upper=quantile(y, 1 - alpha / 2),
                y=mean(y)) %>% 
    ungroup()
  # print(result)
  
  return (result)
}
```


```{r}
gam_modeling_df_2 = provo_df %>%
  spread(measure, value) %>%
  # mutate(len = nchar(word)) %>%  # len has already exists, but do not count punct into len.
  group_by(metric, text_id) %>%
    arrange(word_text_idx) %>%
    mutate(prev_surp = lag(surp),
           prev_freq = lag(freq),
           prev_len = lag(len),
           prev_eyetr_value = lag(eyetr_value)) %>%
  ungroup() %>%
  drop_na() %>%
  rename(psychometric = motr_value)

smooths_df = data.frame()

metrics = c("gaze_duration", "total_duration", "go_past_time", "first_duration")
for (m in metrics) {
  print(paste0("Fitting model for ", m))
  dummy_df = gam_modeling_df_2 %>% filter(metric == m)
  mean_predictors = dummy_df %>% summarise(surp = mean(surp), len = mean(len), freq = mean(freq))
  smooths = dummy_df %>% fit_gam_2(., mean_predictors)
  #Fix 0 surprisal = 0 ms
  gam_smooths = smooths %>% mutate(delta = 0 - y[1], y=y + delta, y_lower= y_lower + delta, y_upper=y_upper + delta)
  smooths_df = rbind(smooths_df, gam_smooths %>% mutate(psychometric = m))
}
```

```{r}
get_d_points = function(df) {
    x = density(df$surp)$x
    y = density(df$surp)$y
    return(data.frame(x, y))
  }

density_data = data.frame()

for(m in c("gaze_duration", "total_duration", "go_past_time", "first_duration")) {
  dummy_df = provo_df %>% filter(metric == m) %>%
      do({get_d_points(.)}) %>%
      filter(x>0, x<20)
  density_data = rbind(density_data, dummy_df %>% mutate(metric=m))
}
```

```{r}
# Surprisal curves
  ggplot() +
      # Density Data
      annotate("rect", xmin=0, xmax=20, ymin=-20,ymax=-10, fill="#f4f4f4", color="grey", alpha=1, size = 0) +
      geom_line(data = density_data, aes(x=x, y=y*50 - 18), color="#aaaaaa", size = 0.4) +
      # Surrp / Rt data
      geom_line(data = smooths_df, aes(x=prev_surp, y=y, color = psychometric), size=0.7) +
      geom_ribbon(data = smooths_df, aes(x=prev_surp, ymin=y_lower, ymax=y_upper, fill = psychometric), alpha=0.3, size=0.5) +
      scale_x_continuous(labels=c(0, 10, 20), breaks=c(0, 10, 20), minor_breaks = NULL) +
      facet_wrap(psychometric~., nrow = 1) +
      ylab("Slowdown due to Surprisal (ms)") +
      xlab("Surprisal of Word") +
      ggtitle("MoTR Times and Previous Word Surprisal")
  theme(
    legend.position = "none",
    panel.grid.minor = element_blank()
  )
```

## precision and recall for FPReg
```{r}
FPReg_df = provo_df %>% filter(metric == "FPReg") %>% spread(measure, value)
# write.csv(FPReg_df, file = "/Users/cui/Desktop/MoTR/pipeline/ancillary_data/FPReg.csv", row.names = FALSE)
confusion_matrix <- table(FPReg_df$motr_value > 0, FPReg_df$eyetr_value > 0)
confusion_matrix

true_positives <- confusion_matrix[2, 2]
false_positives <- confusion_matrix[2, 1]
false_negatives <- confusion_matrix[1, 2]

precision <- true_positives / (true_positives + false_positives)
recall <- true_positives / (true_positives + false_negatives)

print("precision of Motr FPReg:")
print(precision)
print("Recall of Motr FPReg:")
print(recall)
```

## Relationship between first duration and regression probabilities.  --> exploratory, not finished.

```{r}
# Motr --> fit a linear model for first_duration. finding: More FPReg, more first duration. 

rlt_df = provo_df %>% filter(metric == "FPReg" | metric == "first_duration") %>% 
  filter(surp > 10) %>%
  spread(metric, value) %>%
  drop_na()

# View(rlt_df)

motr_rlt = rlt_df %>% filter(measure == "motr_value") %>%
  mutate(FPReg_bi = if_else(FPReg == 0, 0, 1),
         fd_normalized = first_duration / len)

motr_rlt$FPReg_bi <- factor(motr_rlt$FPReg_bi)

# View(motr_rlt)

# levels(motr_rlt$FPReg_bi)

# motr_lm = lm(first_duration ~ freq + FPReg_bi, data = motr_rlt)
motr_lm = lm(fd_normalized ~ FPReg, data = motr_rlt)

summary(motr_lm)

# write.csv(rlt_df, file = "/Users/cui/Desktop/MoTR/pipeline/ancillary_data/rlt.csv", row.names = FALSE)

```


```{r}
# eyrtr -> same as motr, more FPReg, more first_duration

eyetr_rlt = rlt_df %>% filter(measure == "eyetr_value") %>%
  mutate(FPReg_bi = if_else(FPReg == 0, 0, 1),
         fd_normalized = first_duration / len)

eyetr_rlt$FPReg_bi <- factor(eyetr_rlt$FPReg_bi)

# eyetr_lm = lm(first_duration ~ FPReg_bi, data = eyetr_rlt)
eyetr_lm = lm(fd_normalized ~ FPReg, data = eyetr_rlt)

summary(eyetr_lm)

```



